Understand which values within the metadata are correlated. Make sure you understand how these things are related in reality and mathematically.
tellme <- function(name){message("Package ", name, " version: ", packageVersion(name))}
library(tidyr); tellme("tidyr")
## Package tidyr version: 1.3.0
suppressPackageStartupMessages(library(dplyr)); tellme("dplyr")
## Package dplyr version: 1.1.2
library(ggplot2); tellme("ggplot2")
## Package ggplot2 version: 3.4.2
Direct output.
Output will be saved to: ../output
Get the per-particpant metadata.
pipeline = "../../"
prevModule = dir(path=pipeline, pattern="Participant_Metadata", full.names = TRUE)
fileName = file.path(prevModule, "output", "ANIGMA-metadata_by_AN_participant.txt")
diffs = read.delim(fileName)
Read in data from file:
../..//01_Participant_Metadata/output/ANIGMA-metadata_by_AN_participant.txt
This table has 118 rows and 25 columns.
Read per-sample data.
meta = read.delim("../../input/meta/ANIGMA-metadata.txt") %>%
select(PARTICIPANT.ID, LOCATION, TIMEPOINT, AGE, SUBTYPE, BMI,
STAI_Y1, STAI_Y2, STAI_TOTAL, PSS, DAYS_TREAT, Weight_kg,
DUR_ILLNESS_YRS)
dim(meta)
## [1] 352 13
This meta data has 352 rows and 13
columns.
The HC equivalent data frame:
HC = meta %>% filter(TIMEPOINT=="HC") %>%
select(PARTICIPANT.ID, LOCATION, AGE, SUBTYPE, STAI_Y1, STAI_Y2, STAI_TOTAL, PSS, BMI, Weight_kg)
This subset of the data has 136 rows and 10
columns.
Create a correlation matrix.
df = diffs %>% select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)
cor.co = cor(df, use="pairwise.complete.obs", method = "pearson")
# much of this is taken from
# http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization
#functions
tidyMelt <- function(cormat, values_name="values"){
cormat.new = cormat %>% data.frame()
cormat.new$var1 = row.names(cormat.new)
melted_cormat = cormat.new %>% pivot_longer(cols=-var1, names_to = "var2", values_to = values_name)
return(melted_cormat)
}
make_heatmap_1 <- function(cor.co2){
ggplot(data = cor.co2, aes(x=var1, y=var2, fill=cor)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation",
na.value = gray(.9)) +
theme_minimal()+ # minimal theme
xlab("") +
ylab("") +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1)) +
coord_fixed()
}
cor.co2 = tidyMelt(cor.co, values_name="cor")
make_heatmap_1(cor.co2)
Use only the upper triangle, and order features in a sensible way.
# actions
# Reorder the correlation matrix
cormat <- reorder_cormat(cor.co)
# upper_tri <- get_upper_tri(cormat)
triangle1 = get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat = tidyMelt(triangle1, values_name="cor")
# order levels to match triangle
melted_cormat$var1 = factor(melted_cormat$var1, levels=row.names(triangle1))
melted_cormat$var2 = factor(melted_cormat$var2, levels=row.names(triangle1))
# Create a ggheatmap
make_heatmap_1(melted_cormat)
Manually set the order.
manual.order = c('BMI.T1', 'Weight_kg.T1', 'BMI.T2', 'Weight_kg.T2',
'T1.severity', 'DAYS_TREAT', 'BMI.diff', 'Weight_kg.diff',
'AGE', 'DUR_ILLNESS_YRS',
'PSS.diff', 'STAI_Y1.diff', 'STAI_Y2.diff', 'STAI_TOTAL.diff',
'PSS.T2',
'STAI_Y2.T1', 'PSS.T1', 'STAI_Y1.T1', 'STAI_TOTAL.T1', 'STAI_Y1.T2', 'STAI_Y2.T2', 'STAI_TOTAL.T2')
manual.order
## [1] "BMI.T1" "Weight_kg.T1" "BMI.T2" "Weight_kg.T2"
## [5] "T1.severity" "DAYS_TREAT" "BMI.diff" "Weight_kg.diff"
## [9] "AGE" "DUR_ILLNESS_YRS" "PSS.diff" "STAI_Y1.diff"
## [13] "STAI_Y2.diff" "STAI_TOTAL.diff" "PSS.T2" "STAI_Y2.T1"
## [17] "PSS.T1" "STAI_Y1.T1" "STAI_TOTAL.T1" "STAI_Y1.T2"
## [21] "STAI_Y2.T2" "STAI_TOTAL.T2"
Replot with manual order.
# Reorder the correlation matrix
cormat <- cor.co[manual.order, manual.order]
# upper_tri <- get_upper_tri(cormat)
triangle1 = get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat = tidyMelt(triangle1, values_name="cor")
# order levels to match manual order
melted_cormat$var1 = factor(melted_cormat$var1, levels=row.names(triangle1))
melted_cormat$var2 = factor(melted_cormat$var2, levels=row.names(triangle1))
# Create a ggheatmap
make_heatmap_1(melted_cormat)
Now lets get some p-values.
feats = colnames(df)
pvalCut = 0.01
cors = matrix(data=NA, nrow=length(feats), ncol=length(feats), dimnames = list(feats, feats))
pvals = cors
cors.sig = cors
for (i in feats){
for (j in feats){
if ( i != j ){
res = cor.test(df[,i], df[,j], method="pearson")
cors[i,j] = res$estimate
pvals[i,j] = res$p.value
if (res$p.value < pvalCut){
cors.sig[i,j] = res$estimate
}
}
}
}
Make a new plot that only includes values were the correlation was
significant (p < 0.01).
reshapeMatrixToPlot <- function(correlationMatrix){
# Reorder the correlation matrix
cormat <- correlationMatrix[manual.order, manual.order]
# only one triangle
triangle1 = get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat = tidyMelt(triangle1, values_name="cor")
# order levels to match triangle order
melted_cormat$var1 = factor(melted_cormat$var1, levels=row.names(triangle1))
melted_cormat$var2 = factor(melted_cormat$var2, levels=row.names(triangle1))
return(melted_cormat)
}
# match most recent plot - show melt and plot method matches
make_heatmap_1(reshapeMatrixToPlot(cor.co)) + ggtitle("Upper triangle")
# should also match most recent plot - show input matrix matches
make_heatmap_1(reshapeMatrixToPlot(cors)) + ggtitle("Upper triangle, excluding identity line")
# only sig
make_heatmap_1(reshapeMatrixToPlot(cors.sig)) + ggtitle(paste0("Correlations with p-value less than ", pvalCut))
Now lets note what is mathmatically related.
# T1 severity is related to BMI at T1 and therefore also related to weight at T1
severity = data.frame(x=c("T1.severity", "T1.severity"),
y=c("BMI.T1", "Weight_kg.T1"))
related = data.frame(x=c("STAI_TOTAL", "STAI_TOTAL", "BMI"),
y=c("STAI_Y1", "STAI_Y2", "Weight_kg"))
# these are related within a given time point
related1 = data.frame(x=paste(related$x, "T1", sep="."),
y=paste(related$y, "T1", sep="."))
related2 = data.frame(x=paste(related$x, "T2", sep="."),
y=paste(related$y, "T2", sep="."))
# by extension, the diff of each thing is related to each time point of the other thing
related1.diff = data.frame(x=paste(related$x, "T1", sep="."),
y=paste(related$y, "diff", sep="."))
related1.diff.rev = data.frame(x=paste(related$x, "diff", sep="."),
y=paste(related$y, "T1", sep="."))
related2.diff = data.frame(x=paste(related$x, "T2", sep="."),
y=paste(related$y, "diff", sep="."))
related2.diff.rev = data.frame(x=paste(related$x, "diff", sep="."),
y=paste(related$y, "T2", sep="."))
# all diffables have a relationship between their diff and each time point
# diffables = c("STAI_Y1", "STAI_Y2", "STAI_TOTAL", "PSS", "BMI", "Weight_kg")
diffables = gsub(grep(".diff", names(diffs), value = T), pattern=".diff", replacement="")
relatedDiff1 = data.frame(x=paste(diffables, "T1", sep="."),
y=paste(diffables, "diff", sep="."))
relatedDiff2 = data.frame(x=paste(diffables, "T2", sep="."),
y=paste(diffables, "diff", sep="."))
stack = rbind(related1, related2,
related1.diff, related1.diff.rev,
related2.diff, related2.diff.rev,
relatedDiff1, relatedDiff2,
severity)
# relatedness goes both ways
reverse.stack = data.frame(x=stack$y, y=stack$x)
stack2 = rbind(stack, reverse.stack)
head(stack2)
## x y
## 1 STAI_TOTAL.T1 STAI_Y1.T1
## 2 STAI_TOTAL.T1 STAI_Y2.T1
## 3 BMI.T1 Weight_kg.T1
## 4 STAI_TOTAL.T2 STAI_Y1.T2
## 5 STAI_TOTAL.T2 STAI_Y2.T2
## 6 BMI.T2 Weight_kg.T2
test annotations
make_heatmap_1(reshapeMatrixToPlot(cor.co)) +
# mark related features
annotate(geom="text", x=stack2$x, y=stack2$y, label="+") +
# identity line gets similar marking, slighly bigger
annotate(geom="text", x=feats, y=feats, label="+", size=6)
Show all correlations on the upper triangle but in the lower triangle show only the significant ones.
# upper - all correlation values
# Reorder the correlation matrix
cormat <- cor.co[manual.order, manual.order]
# only one triangle
triangle1 = get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat1 = tidyMelt(triangle1, values_name="cor") %>% filter(!is.na(cor))
# order levels to match triangle order
melted_cormat1$var1 = factor(melted_cormat1$var1, levels=manual.order)
melted_cormat1$var2 = factor(melted_cormat1$var2, levels=manual.order)
# lower - only sig values
cormat <- cors.sig[manual.order, manual.order]
# only one triangle
get_lower_tri2<-function(cormat){
cormat[upper.tri(cormat)] <- 50
for (f in feats){
cormat[f, f] <- 50
}
return(cormat)
}
triangle2 = get_lower_tri2(cormat)
# Melt the correlation matrix
melted_cormat2 = tidyMelt(triangle2, values_name="cor") %>% filter(cor != 50)
# order levels to match triangle order
melted_cormat2$var1 = factor(melted_cormat2$var1, levels=manual.order)
melted_cormat2$var2 = factor(melted_cormat2$var2, levels=manual.order)
# merge in melted form
melted_cormat = rbind(melted_cormat1, melted_cormat2)
# order levels to match triangle order
melted_cormat$var1 = factor(melted_cormat$var1, levels=manual.order)
melted_cormat$var2 = factor(melted_cormat$var2, levels=manual.order)
figure = make_heatmap_1(melted_cormat) +
# mark related features
annotate(geom="text", x=stack2$x, y=stack2$y, label="+") +
# identity line gets similar marking, slighly bigger
annotate(geom="text", x=feats, y=feats, label="+", size=6)
print(figure)
The objects we need moving forward are:
Remove everything else.
rm(list = setdiff(ls(), c("manual.order", "stack2", "diffs", "HC", "outdir")))
As a function:
#functions
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
tidyMelt <- function(cormat, values_name="values"){
cormat.new = cormat %>% data.frame()
cormat.new$var1 = row.names(cormat.new)
melted_cormat = cormat.new %>% pivot_longer(cols=-var1, names_to = "var2", values_to = values_name)
return(melted_cormat)
}
make_heatmap_1 <- function(cor.co2){
ggplot(data = cor.co2, aes(x=var1, y=var2, fill=cor)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation",
na.value = gray(.9)) +
theme_minimal()+ # minimal theme
xlab("") +
ylab("") +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1)) +
coord_fixed()
}
make_heatmap_2 <- function(dataframe, my.order=manual.order, markPairings=stack2, pvalCut=0.01,
cor.mat.file=NULL, pval.mat.file=NULL){
if (is.null(my.order)){
my.order = names(dataframe)
}
feats = colnames(dataframe)
if (!is.null(markPairings)){
names(markPairings) = c("x", "y")
markPairings = markPairings %>%
filter( x %in% feats) %>%
filter( y %in% feats)
}
cor.co = cor(dataframe, use="pairwise.complete.obs", method = "pearson")
if (!is.null(cor.mat.file)){
write.table(cbind(feature=row.names(cor.co), cor.co),
file=cor.mat.file,
sep="\t", quote=F, row.names = F, col.names = T)
}
cors = matrix(data=NA, nrow=length(feats),
ncol=length(feats),
dimnames = list(feats, feats))
pvals = cors
cors.sig = cors
for (i in feats){
for (j in feats){
if ( i != j ){
res = cor.test(dataframe[,i], dataframe[,j], method="pearson")
pvals[i,j] = res$p.value
if (res$p.value < pvalCut){
cors.sig[i,j] = res$estimate
}
}
}
}
if (!is.null(pval.mat.file)){
write.table(cbind(feature=row.names(pvals), pvals),
file=pval.mat.file,
sep="\t", quote=F, row.names = F, col.names = T)
}
# upper - all correlation values
# Reorder the correlation matrix
cormat <- cor.co[my.order, my.order]
# only one triangle
triangle1 = get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat1 = tidyMelt(triangle1, values_name="cor") %>% filter(!is.na(cor))
# order levels to match triangle order
melted_cormat1$var1 = factor(melted_cormat1$var1, levels=my.order)
melted_cormat1$var2 = factor(melted_cormat1$var2, levels=my.order)
# lower - only sig values
cormat <- cors.sig[my.order, my.order]
# only one triangle
get_lower_tri2<-function(cormat){
cormat[upper.tri(cormat)] <- 50
for (f in feats){
cormat[f, f] <- 50
}
return(cormat)
}
triangle2 = get_lower_tri2(cormat)
# Melt the correlation matrix
melted_cormat2 = tidyMelt(triangle2, values_name="cor") %>% filter(cor != 50)
# order levels to match triangle order
melted_cormat2$var1 = factor(melted_cormat2$var1, levels=my.order)
melted_cormat2$var2 = factor(melted_cormat2$var2, levels=my.order)
# merge in melted form
melted_cormat = rbind(melted_cormat1, melted_cormat2)
# order levels to match triangle order
melted_cormat$var1 = factor(melted_cormat$var1, levels=my.order)
melted_cormat$var2 = factor(melted_cormat$var2, levels=my.order)
figure = make_heatmap_1(melted_cormat)
if (!is.null(markPairings)){
figure = figure +
# mark related features
annotate(geom="text", x=markPairings[,1], y=markPairings[,2], label="+") +
# identity line gets similar marking, slighly bigger
annotate(geom="text", x=feats, y=feats, label="+", size=6)
}
return(figure)
}
fig1 = make_heatmap_2(diffs %>% select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
cor.mat.file = file.path(outdir, "correlations_AN_cor.txt"),
pval.mat.file = file.path(outdir, "correlations_AN_pval.txt"))
fig1
Save figure to file.
ggsave(filename = file.path(outdir, "correlations_AN.png"),
plot = fig1)
## Saving 7 x 5 in image
Add LOCATION and SUBTYPE as a row.
make_heatmap_2(diffs %>%
mutate(LOC.N = as.numeric(factor(LOCATION))) %>%
mutate(TYPE.N = as.numeric(factor(SUBTYPE))) %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
my.order = c( "TYPE.N", "LOC.N", manual.order))
s1 = make_heatmap_2(diffs %>%
filter(SUBTYPE == "BP") %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
ggtitle("BP")
s2 = make_heatmap_2(diffs %>%
filter(SUBTYPE == "ANR") %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
ggtitle("ANR")
## not enough of these to plot
#
# make_heatmap_2(diffs %>%
# filter(SUBTYPE == "EDNOS") %>%
# select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
# ggtitle("EDNOS")
#
# make_heatmap_2(diffs %>%
# filter(SUBTYPE == "ARFID") %>%
# select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
# ggtitle("ARFID")
fileName.locations = file.path(outdir, "correlations_by-subtype.pdf")
pdf(fileName.locations)
s1
s2
dev.off()
## quartz_off_screen
## 2
s1
s2
Using the above function.
p1 = make_heatmap_2(diffs %>%
filter(LOCATION == "ACUTE") %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE)) +
ggtitle("ACUTE")
# no ceed values for DUR_ILLNESS_YRS
p2 = make_heatmap_2(diffs %>%
filter(LOCATION == "CEED") %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE, -DUR_ILLNESS_YRS),
my.order = setdiff(manual.order, "DUR_ILLNESS_YRS") ) +
ggtitle("CEED")
diffs.FARGO = diffs %>%
filter(LOCATION == "FARGO") %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE) %>%
select(-contains("STAI_Y2"), -contains("STAI_TOTAL") )
p3 = make_heatmap_2(diffs.FARGO,
my.order=manual.order[manual.order %in% names(diffs.FARGO)],
markPairings = stack2 %>%
filter(x %in% names(diffs.FARGO)) %>%
filter(y %in% names(diffs.FARGO))) +
ggtitle("FARGO")
fileName.locations = file.path(outdir, "correlations_by-location.pdf")
pdf(fileName.locations)
p1
p2
p3
dev.off()
## quartz_off_screen
## 2
p1
p2
p3
hc1 = make_heatmap_2(HC %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
my.order = NULL, markPairings = stack2) +
ggtitle("Non-eating Disorder")
hc1
hc2 = make_heatmap_2(HC %>%
filter(LOCATION == "ACUTE") %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
my.order = NULL) +
ggtitle("Non-eating Disorder - ACUTE")
hc3 = make_heatmap_2(HC %>%
filter(LOCATION == "CEED") %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE),
my.order = NULL) +
ggtitle("Non-eating Disorder - CEED")
hc4 = make_heatmap_2(HC %>%
filter(LOCATION == "FARGO") %>%
select(-PARTICIPANT.ID, -LOCATION, -SUBTYPE) %>%
select(-contains("STAI_Y2"), -contains("STAI_TOTAL")),
my.order = NULL) +
ggtitle("Non-eating Disorder - FARGO")
fileName.locations = file.path(outdir, "correlations_non-ed_HC.pdf")
pdf(fileName.locations)
hc1
hc2
hc3
hc4
dev.off()
## quartz_off_screen
## 2
hc2
hc3
hc4
sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.4.2 dplyr_1.1.2 tidyr_1.3.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.3 jsonlite_1.8.4 highr_0.10 compiler_4.3.0
## [5] tidyselect_1.2.0 jquerylib_0.1.4 textshaping_0.3.6 systemfonts_1.0.4
## [9] scales_1.2.1 yaml_2.3.7 fastmap_1.1.1 R6_2.5.1
## [13] labeling_0.4.2 generics_0.1.3 knitr_1.42 tibble_3.2.1
## [17] munsell_0.5.0 bslib_0.4.2 pillar_1.9.0 rlang_1.1.1
## [21] utf8_1.2.3 cachem_1.0.8 xfun_0.39 sass_0.4.6
## [25] cli_3.6.1 withr_2.5.0 magrittr_2.0.3 digest_0.6.31
## [29] grid_4.3.0 rstudioapi_0.14 lifecycle_1.0.3 vctrs_0.6.2
## [33] evaluate_0.21 glue_1.6.2 farver_2.1.1 ragg_1.2.5
## [37] fansi_1.0.4 colorspace_2.1-0 rmarkdown_2.21 purrr_1.0.1
## [41] tools_4.3.0 pkgconfig_2.0.3 htmltools_0.5.5